Analysis of Head and Neck Squamous Cell Carcinomas Data

Group 21

Asta Zeuner s203544
Astrid Ginnerup s203523
Emma Danø s203564
Andrea Kristensen s193769
Yayi Wang s243554

Introduction

  • Background
    • Head and neck squamous cell carcinomas (HNSCC) are a group of malignancies affecting the mucosal surfaces of the head and neck.
    • A focus on phenotype, survival data, and protein expression quantification from the UCSC Xena.
  • Research Objective
    • Apply Tidyverse techniques learned in class to real-world data.
    • Extract and uncover scientific insights from the HNSCC datasets.

Materials and methods

Cleaning and Augmenting Data

  • Download and naming

  • Join tables

  • Cleaning

    • Choosing variables and removing TCGA in sample_id
  • Augmenting

    • Creating patient_id from sample_id

    • Creating 5 and 10 years age interval

    • Creating expression mean variables

    • Creating binominal variable for alcohol history

library("tidyverse")
library("table1")
library("here")

raw_dir <- here("data/_raw/")
data_file <- "TCGA-HNSC.protein.tsv.gz"
data_loc <- "https://gdc-hub.s3.us-east-1.amazonaws.com/download/"

if( !dir.exists(raw_dir) ){
  dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
  download.file(
    url = str_c(data_loc, data_file),
    destfile = str_c(raw_dir, data_file))
}

raw_dir <- here("data/_raw/")
data_file <- "TCGA-HNSC.survival.tsv.gz"
data_loc <- "https://gdc-hub.s3.us-east-1.amazonaws.com/download/"

if( !dir.exists(raw_dir) ){
  dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
  download.file(
    url = str_c(data_loc, data_file),
    destfile = str_c(raw_dir, data_file))
}

raw_dir <- here("data/_raw/")
data_file <- "TCGA-HNSC.clinical.tsv.gz"
data_loc <- "https://gdc-hub.s3.us-east-1.amazonaws.com/download/"

if( !dir.exists(raw_dir) ){
  dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
  download.file(
    url = str_c(data_loc, data_file),
    destfile = str_c(raw_dir, data_file))
}


protein_data <- read_tsv(file = here("data/_raw/TCGA-HNSC.protein.tsv.gz"))
survival_data <- read_tsv(file = here("data/_raw/TCGA-HNSC.survival.tsv.gz"))
clinical_data <- read_tsv(file = here("data/_raw/TCGA-HNSC.clinical.tsv.gz"))

meta_data <- inner_join(clinical_data, survival_data, by = c("sample" = "sample"))

data_dir <- here("data/")

write_tsv(x = meta_data, 
          file = str_c(data_dir, 
                       "01_meta_data_load.tsv.gz"))

# Pivot the peptide expression data longer
peptide_long <- protein_data |> 
                pivot_longer(cols = -1, 
                             names_to = "sample", 
                             values_to = "expression")

data_dir <- here("data/")

write_tsv( x = peptide_long,
           file = str_c(data_dir, 
                  "02_peptide_expression_load.tsv.gz"))

final_data <- inner_join(meta_data, 
                         peptide_long, 
                         by = "sample")

data_dir <- here("data/")


write_tsv( x = final_data,
           file = str_c(data_dir, 
                  "03_expression_meta_data_load.tsv.gz"))

# Choosing variables
expression_meta_data_clean_1 <- expression_meta_data_clean |> 
  select(sample_id = sample,
         gender = gender.demographic,
         race = race.demographic,
         vital_status = vital_status.demographic,
         overall_survival = OS,
         primary_site,
         pathologic_stage = ajcc_pathologic_stage.diagnoses,
         age_at_index = age_at_index.demographic,
         year_of_birth = year_of_birth.demographic,
         year_of_death = year_of_death.demographic,
         tissue_source_location = name.tissue_source_site,
         cigarettes_per_day = cigarettes_per_day.exposures,
         years_smoked = years_smoked.exposures,
         alcohol_history = alcohol_history.exposures,
         peptide_target,
         expression)


# Removing TCGA in sample_id
expression_meta_data_clean_2 <- expression_meta_data_clean_1 |> 
  mutate(sample_id = sub("^TCGA-(.*)", "\\1", sample_id))


# Creating patient_id from sample_id
expression_meta_data_aug_1 <- expression_meta_data_aug |> 
  mutate(patient_id = sub("(.*)-..", "\\1", sample_id)) |> 
  relocate(patient_id, .before = sample_id)


# Creating 5 and 10 years age intervals as variables
expression_meta_data_aug_2 <- expression_meta_data_aug_1 |> 
  mutate(age_interval_5_year = cut(age_at_index,
                                   breaks = seq(0, 100, by = 5),
                                   labels = paste(seq(0, 95, by = 5),
                                                  seq(5, 100, by = 5) - 1,
                                                  sep = "-"), 
                                   include.lowest = TRUE), 
          age_interval_10_year = cut(age_at_index,
                                     breaks = seq(0, 100, by = 10),
                                     labels = paste(seq(0, 90, by = 10),
                                                    seq(10, 100, by = 10) - 1,
                                                    sep = "-"), 
                                     include.lowest = TRUE)) |> 

# Placing variables after age_at_index variable
  relocate(age_interval_5_year, .after = age_at_index) |> 
  relocate(age_interval_10_year, .after = age_interval_5_year)


# Creating expression mean variable
expression_meta_data_aug_3 <- expression_meta_data_aug_2 |> 
  group_by(peptide_target) |> 
  mutate(mean_expression = mean(expression, na.rm = TRUE)) |> 
  ungroup() # Removing the grouping to ensure that the following work is not messed up


# Creating expression mean variable for each pathologic state
mean_of_expression_pathologic_stages <- expression_meta_data_aug_3 |> 
  filter(!is.na(pathologic_stage)) |>  
  group_by(peptide_target, pathologic_stage) |> 
  summarise(mean_expression = mean(expression, na.rm = TRUE), .groups = "drop") |> 
  pivot_wider(names_from = pathologic_stage,
              values_from = mean_expression,
              names_prefix = "mean_expression_")

# Join the calculated means back into the original dataset with the join function
expression_meta_data_aug_4 <- expression_meta_data_aug_3 |> 
  right_join(mean_of_expression_pathologic_stages, by = "peptide_target")


# Creating binominal variable for alcohol history
expression_meta_data_aug_5 <- expression_meta_data_aug_4 |> 
  mutate(alcohol_history_binomial = case_when(alcohol_history == "No" ~ 0,
                                              alcohol_history == "Yes" ~ 1)) |> 
  relocate(alcohol_history_binomial, .after = alcohol_history)

Description

  • Check columns and rows (98 columns vs 26 columns, 353 patients)
  • Check Pathologic Stage (5 stages)
  • Check Patients statistics and distribution
# A tibble: 1 × 2
  mean_age middle_age
     <dbl>      <dbl>
1     61.1         61

Analysis heatmap

  • Selecting top up- and downregulated peptide targets with filter(), group_by(), summarize() and slice_max() and slice_min()
  • Extract the peptide_target levels as vector with pull()
  • Filter the main dataset for the selected peptides with semi_join()

Analysis linear model - Vital stage

expression_meta_data_lm_vital_nested_model <- expression_meta_data_lm_vital_nested |> 
  mutate(model_object = map(.x = data,
                            .f = ~lm(formula = expression ~ overall_survival,
                                   data = .x)))

expression_meta_data_lm_vital_estimates_clean <- expression_meta_data_lm_vital_estimates |> 
  filter(term == "overall_survival") |> 
  select(peptide_target,
         p.value,
         estimate,
         conf.low,
         conf.high) |> 
  ungroup(peptide_target)

Analysis Linear model - Alcohol history

Analysis visual

Analysis PCA

protein_data_trans <- protein_data |> 
  drop_na() |> 
  pivot_longer(col = -1,
               names_to = "sample",
               values_to = "expression") |> 
  pivot_wider(names_from = peptide_target,
              values_from = expression)

pca_analysis <- protein_data_trans |> 
  select_if(is.numeric) |> 
  drop_na() |> 
  prcomp(center = TRUE,
         scale = TRUE )

Discussion

  • Indications of what proteins to look further into

  • Limitations in dataset

    • Lack of continous variables restricts possibilities of visualization